In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, log_loss
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
#¶

PART-A¶

#¶

1. Data Understanding¶

A. Read all the 3 CSV files as DataFrame and store them into 3 separate variables.¶

In [2]:
# Reading Normal.csv
type_normal=pd.read_csv('Normal.csv')
type_normal.head()
Out[2]:
P_incidence P_tilt L_angle S_slope P_radius S_Degree Class
0 38.505273 16.964297 35.112814 21.540976 127.632875 7.986683 Normal
1 54.920858 18.968430 51.601455 35.952428 125.846646 2.001642 Normal
2 44.362490 8.945435 46.902096 35.417055 129.220682 4.994195 Normal
3 48.318931 17.452121 48.000000 30.866809 128.980308 -0.910941 Normal
4 45.701789 10.659859 42.577846 35.041929 130.178314 -3.388910 Normal
In [3]:
# Reading Type_H.csv
type_h=pd.read_csv('Type_H.csv')
type_h.head()
Out[3]:
P_incidence P_tilt L_angle S_slope P_radius S_Degree Class
0 63.027817 22.552586 39.609117 40.475232 98.672917 -0.254400 Type_H
1 39.056951 10.060991 25.015378 28.995960 114.405425 4.564259 Type_H
2 68.832021 22.218482 50.092194 46.613539 105.985135 -3.530317 Type_H
3 69.297008 24.652878 44.311238 44.644130 101.868495 11.211523 Type_H
4 49.712859 9.652075 28.317406 40.060784 108.168725 7.918501 Type_H
In [4]:
# Reading Type_S.csv
type_s=pd.read_csv('Type_S.csv')
type_s.head()
Out[4]:
P_incidence P_tilt L_angle S_slope P_radius S_Degree Class
0 74.377678 32.053104 78.772013 42.324573 143.560690 56.125906 Type_S
1 89.680567 32.704435 83.130732 56.976132 129.955476 92.027277 Type_S
2 44.529051 9.433234 52.000000 35.095817 134.711772 29.106575 Type_S
3 77.690577 21.380645 64.429442 56.309932 114.818751 26.931841 Type_S
4 76.147212 21.936186 82.961502 54.211027 123.932010 10.431972 Type_S

B. Print Shape and columns of all the 3 DataFrames.¶

In [5]:
print("Shape of normal class", type_normal.shape)
type_normal.info()
Shape of normal class (100, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   P_incidence  100 non-null    float64
 1   P_tilt       100 non-null    float64
 2   L_angle      100 non-null    float64
 3   S_slope      100 non-null    float64
 4   P_radius     100 non-null    float64
 5   S_Degree     100 non-null    float64
 6   Class        100 non-null    object 
dtypes: float64(6), object(1)
memory usage: 5.6+ KB

Normal class dataset contains 100 data with 7 columns.

In [6]:
print("Shape of type_h class", type_h.shape)
type_h.info()
Shape of type_h class (60, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   P_incidence  60 non-null     float64
 1   P_tilt       60 non-null     float64
 2   L_angle      60 non-null     float64
 3   S_slope      60 non-null     float64
 4   P_radius     60 non-null     float64
 5   S_Degree     60 non-null     float64
 6   Class        60 non-null     object 
dtypes: float64(6), object(1)
memory usage: 3.4+ KB

Type_H class dataset contains 60 data with 7 columns.

In [7]:
print("Shape of type_s class", type_s.shape)
type_s.info()
Shape of type_s class (150, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   P_incidence  150 non-null    float64
 1   P_tilt       150 non-null    float64
 2   L_angle      150 non-null    float64
 3   S_slope      150 non-null    float64
 4   P_radius     150 non-null    float64
 5   S_Degree     150 non-null    float64
 6   Class        150 non-null    object 
dtypes: float64(6), object(1)
memory usage: 8.3+ KB

Type_S class dataset contains 150 data with 7 columns.

C. Compare Column names of all the 3 DataFrames and clearly write observations.¶

In [8]:
print("Columns of Normal class Dataset:-", ", ".join(type_normal.columns))
print("Columns of Type_H class Dataset:-", ", ".join(type_h.columns))
print("Columns of Type_S class Dataset:-", ", ".join(type_s.columns))
Columns of Normal class Dataset:- P_incidence, P_tilt, L_angle, S_slope, P_radius, S_Degree, Class
Columns of Type_H class Dataset:- P_incidence, P_tilt, L_angle, S_slope, P_radius, S_Degree, Class
Columns of Type_S class Dataset:- P_incidence, P_tilt, L_angle, S_slope, P_radius, S_Degree, Class
In [9]:
def compare_list(l1,l2,l3):
   l1.sort()
   l2.sort()
   l3.sort()
   if l1==l2==l3:
      return 'Identical columns'
   return 'Non-Identical columns'

print("Column comparison for Normal, Type_H and Type_S class Dataset:-", compare_list(type_normal.columns.tolist(), type_h.columns.tolist(), type_s.columns.tolist()))
Column comparison for Normal, Type_H and Type_S class Dataset:- Identical columns

All 3 datasets have same columnName

D. Print DataTypes of all the 3 DataFrames.¶

In [10]:
print("DataType of Normal class Dataset")
type_normal.dtypes
DataType of Normal class Dataset
Out[10]:
P_incidence    float64
P_tilt         float64
L_angle        float64
S_slope        float64
P_radius       float64
S_Degree       float64
Class           object
dtype: object

In Normal class Dataset, 6 columns i.e. P_incidence, P_tilt, L_angle, S_slope, P_radius & S_Degree have float64 as dataType and 1 column i.e. Class as object dataType.

In [11]:
print("DataType of Type_H class Dataset")
type_h.dtypes
DataType of Type_H class Dataset
Out[11]:
P_incidence    float64
P_tilt         float64
L_angle        float64
S_slope        float64
P_radius       float64
S_Degree       float64
Class           object
dtype: object

In Type_H class Dataset, 6 columns i.e. P_incidence, P_tilt, L_angle, S_slope, P_radius & S_Degree have float64 as dataType and 1 column i.e. Class as object dataType.

In [12]:
print("DataType of Type_S class Dataset")
type_s.dtypes
DataType of Type_S class Dataset
Out[12]:
P_incidence    float64
P_tilt         float64
L_angle        float64
S_slope        float64
P_radius       float64
S_Degree       float64
Class           object
dtype: object

In Type_S class Dataset, 6 columns i.e. P_incidence, P_tilt, L_angle, S_slope, P_radius & S_Degree have float64 as dataType and 1 column i.e. Class as object dataType.

E. Observe and share variation in ‘Class’ feature of all the 3 DaraFrames.¶

In [13]:
print("Values of 'Class' feature in Normal class Dataset distributed as:-", ', '.join(type_normal.Class.unique()))
Values of 'Class' feature in Normal class Dataset distributed as:- Normal, Nrmal

'Class' feature of Normal class dataset don't have formatted values, which will be changed to all as 'normal'.

In [14]:
print("Values of 'Class' feature in Type_H class Dataset distributed as:-", ', '.join(type_h.Class.unique()))
Values of 'Class' feature in Type_H class Dataset distributed as:- Type_H, type_h

'Class' feature of Type_H class dataset don't have formatted values, which will be changed to all as 'type_h'.

In [15]:
print("Values of 'Class' feature in Type_S class Dataset distributed as:-", ', '.join(type_s.Class.unique()))
Values of 'Class' feature in Type_S class Dataset distributed as:- Type_S, tp_s

'Class' feature of Type_S class dataset don't have formatted values, which will be changed to all as 'type_s'.

2. Data Preparation and Exploration¶

A. Unify all the variations in ‘Class’ feature for all the 3 DataFrames.¶

In [16]:
# Unifying values for 'Class' feature column in all dataframes
type_normal.Class='normal'
type_s.Class='type_s'
type_h.Class='type_h'
In [17]:
print("Values of 'Class' feature in Normal class Dataset has been unified:-", ', '.join(type_normal.Class.unique()))
Values of 'Class' feature in Normal class Dataset has been unified:- normal
In [18]:
print("Values of 'Class' feature in Type_H class Dataset has been unified:-", ', '.join(type_h.Class.unique()))
Values of 'Class' feature in Type_H class Dataset has been unified:- type_h
In [19]:
print("Values of 'Class' feature in Type_S class Dataset has been unified:-", ', '.join(type_s.Class.unique()))
Values of 'Class' feature in Type_S class Dataset has been unified:- type_s

Values have been formatted in all dataframes and are in ready to use state.

B. Combine all the 3 DataFrames to form a single DataFrame.¶

In [20]:
patients=type_normal.append([type_h, type_s])
patients.reset_index(drop=True, inplace=True)
patients.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   P_incidence  310 non-null    float64
 1   P_tilt       310 non-null    float64
 2   L_angle      310 non-null    float64
 3   S_slope      310 non-null    float64
 4   P_radius     310 non-null    float64
 5   S_Degree     310 non-null    float64
 6   Class        310 non-null    object 
dtypes: float64(6), object(1)
memory usage: 17.1+ KB

After combining all dataframes, we got resultant dataframe with 310 rows and 7 columns. Let's change Class dType from object to category, once encoded the lables.

In [21]:
patients.Class=patients.Class.astype('category')
patients.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   P_incidence  310 non-null    float64 
 1   P_tilt       310 non-null    float64 
 2   L_angle      310 non-null    float64 
 3   S_slope      310 non-null    float64 
 4   P_radius     310 non-null    float64 
 5   S_Degree     310 non-null    float64 
 6   Class        310 non-null    category
dtypes: category(1), float64(6)
memory usage: 15.1 KB
In [22]:
patients.head()
Out[22]:
P_incidence P_tilt L_angle S_slope P_radius S_Degree Class
0 38.505273 16.964297 35.112814 21.540976 127.632875 7.986683 normal
1 54.920858 18.968430 51.601455 35.952428 125.846646 2.001642 normal
2 44.362490 8.945435 46.902096 35.417055 129.220682 4.994195 normal
3 48.318931 17.452121 48.000000 30.866809 128.980308 -0.910941 normal
4 45.701789 10.659859 42.577846 35.041929 130.178314 -3.388910 normal

C. Print 5 random samples of this DataFrame.¶

In [23]:
patients.sample(n=5, random_state=1)
Out[23]:
P_incidence P_tilt L_angle S_slope P_radius S_Degree Class
78 74.565015 15.724320 58.618582 58.840695 105.417304 0.599247 normal
244 60.044177 14.309656 58.038865 45.734521 105.131664 30.409133 type_s
185 45.443750 9.906072 45.000000 35.537678 163.071041 20.315315 type_s
70 50.086153 13.430044 34.457541 36.656108 119.134622 3.089484 normal
120 43.922840 14.177959 37.832547 29.744881 134.461016 6.451648 type_h

Printed 5 random samples of patients dataframe with n as 5 and random_state as 1 to get same state for every sample hits.

D. Print Feature-wise percentage of Null values.¶

In [24]:
# To check the missing data percentage
patients.isnull().mean()*100
Out[24]:
P_incidence    0.0
P_tilt         0.0
L_angle        0.0
S_slope        0.0
P_radius       0.0
S_Degree       0.0
Class          0.0
dtype: float64
In [25]:
(patients.isnull().sum()/patients.shape[0])*100
Out[25]:
P_incidence    0.0
P_tilt         0.0
L_angle        0.0
S_slope        0.0
P_radius       0.0
S_Degree       0.0
Class          0.0
dtype: float64

No missing data found, that's why 0%.

E. Check 5-point summary of the new DataFrame.¶

In [26]:
patients.describe(include='all').T
Out[26]:
count unique top freq mean std min 25% 50% 75% max
P_incidence 310.0 NaN NaN NaN 60.496653 17.23652 26.147921 46.430294 58.691038 72.877696 129.834041
P_tilt 310.0 NaN NaN NaN 17.542822 10.00833 -6.554948 10.667069 16.357689 22.120395 49.431864
L_angle 310.0 NaN NaN NaN 51.93093 18.554064 14.0 37.0 49.562398 63.0 125.742385
S_slope 310.0 NaN NaN NaN 42.953831 13.423102 13.366931 33.347122 42.404912 52.695888 121.429566
P_radius 310.0 NaN NaN NaN 117.920655 13.317377 70.082575 110.709196 118.268178 125.467674 163.071041
S_Degree 310.0 NaN NaN NaN 26.296694 37.559027 -11.058179 1.603727 11.767934 41.287352 418.543082
Class 310 3 type_s 150 NaN NaN NaN NaN NaN NaN NaN
In [27]:
plt.figure(figsize=(20,10))
sns.boxplot(data=patients)
plt.yticks(fontsize=12, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.show();
In [28]:
box=px.box(patients.drop(columns=['Class']), orientation='h')
box.show();
In [29]:
patients.mean(),patients.median()
Out[29]:
(P_incidence     60.496653
 P_tilt          17.542822
 L_angle         51.930930
 S_slope         42.953831
 P_radius       117.920655
 S_Degree        26.296694
 dtype: float64,
 P_incidence     58.691038
 P_tilt          16.357689
 L_angle         49.562398
 S_slope         42.404912
 P_radius       118.268178
 S_Degree        11.767934
 dtype: float64)

Summarising 5-point summary:

  • P_incidence
    • Mean(60.496653) & Median(58.691038) are approximately same with 3 values as outliers on right whiskers.
  • P_tilt
    • Mean(17.542822) & Median(16.357689) are approximately same with few values as outliers on right whiskers.
  • L_angle
    • Mean(51.930930) & Median(49.562398) are approximately same with 1 value as outlier on right whiskers.
  • S_slope
    • Mean(42.953831) & Median(42.404912) are approximately same with 1 value as outlier on right whiskers.
  • P_radius
    • Mean(117.920655) & Median(118.268178) are approximately same with few values as outlier on both left and right whiskers.
  • S_Degree
    • Mean(26.296694) is greater than Median(11.767934) with few values as outlier on right whiskers.

3. Data Analysis¶

A. Visualize a heatmap to understand correlation between all features.¶

In [30]:
patients.corr()
Out[30]:
P_incidence P_tilt L_angle S_slope P_radius S_Degree
P_incidence 1.000000 0.629199 0.717282 0.814960 -0.247467 0.638743
P_tilt 0.629199 1.000000 0.432764 0.062345 0.032668 0.397862
L_angle 0.717282 0.432764 1.000000 0.598387 -0.080344 0.533667
S_slope 0.814960 0.062345 0.598387 1.000000 -0.342128 0.523557
P_radius -0.247467 0.032668 -0.080344 -0.342128 1.000000 -0.026065
S_Degree 0.638743 0.397862 0.533667 0.523557 -0.026065 1.000000
In [31]:
plt.figure(figsize=(15,7))
sns.heatmap(patients.corr(), annot=True, cmap='YlGnBu', lw=1)
plt.yticks(fontsize=12, weight='bold', rotation=45)
plt.xticks(fontsize=12, weight='bold', rotation=45)
plt.title(label='Correlation Heatmap', fontsize=18, weight='bold')
plt.show();

P_incidence have high correlation with S_Degree, S_slope, L_angle, P_tilt.

Some data have negative correlation also.

B. Share insights on correlation.¶

In [32]:
patients.corr()
Out[32]:
P_incidence P_tilt L_angle S_slope P_radius S_Degree
P_incidence 1.000000 0.629199 0.717282 0.814960 -0.247467 0.638743
P_tilt 0.629199 1.000000 0.432764 0.062345 0.032668 0.397862
L_angle 0.717282 0.432764 1.000000 0.598387 -0.080344 0.533667
S_slope 0.814960 0.062345 0.598387 1.000000 -0.342128 0.523557
P_radius -0.247467 0.032668 -0.080344 -0.342128 1.000000 -0.026065
S_Degree 0.638743 0.397862 0.533667 0.523557 -0.026065 1.000000

A. Features having stronger correlation with correlation value.¶

B. Features having weaker correlation with correlation value.¶

Correlation coefficients are used to measure the strength of the relationship between two variables where values always range between -1 (strong negative relationship) and +1 (strong positive relationship). Values at or close to zero imply a weak or no linear relationship. Correlation coefficient values less than +0.8 or greater than -0.8 are not considered significant. So as per this:

  • P_incidence is highly correlated with S_slope with value as 0.814960.
  • P_radius have negative correlation with S_slope with value as -0.342128.
  • P_tilt & P_radius have weak correlation with value as 0.032668, same S_Degree & P_radius have weak correlation with value as -0.026065.

C. Visualize a pairplot with 3 classes distinguished by colors and share insights.¶

In [33]:
plt.figure(figsize=(20,10))
sns.pairplot(patients, hue='Class', corner=True)
plt.yticks(fontsize=12, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.show();
<Figure size 1440x720 with 0 Axes>
  • Type_S is highly compared
  • Class with S_Degree, shows right skewness

D. Visualize a jointplot for ‘P_incidence’ and ‘S_slope’ and share insights.¶

In [34]:
plt.figure(figsize=(20,10))
sns.jointplot(data=patients, x='P_incidence', y='S_slope', kind='reg')
plt.show();
<Figure size 1440x720 with 0 Axes>
  • Datapoints for P_incidence & S_slope are mainly situated near best fit line, that's why highly correlated.

E. Visualize a boxplot to check distribution of the features and share insights.¶

In [35]:
box=px.box(patients,x='P_incidence', y='Class')
box.show();
  • P_incidence in class as 'type_s', contain 3 outliers at right whiskers side.
  • P_incidence in class as 'type_h', contain 1 outlier at right whiskers side.
  • P_incidence in class as 'normal', contain 1 outlier at right whiskers side.
  • P_incidence values are larger for class 'type_s'
In [36]:
box=px.box(patients,x='P_tilt', y='Class')
box.show();
  • P_tilt in class as 'type_s', contain no outlier.
  • P_tilt in class as 'type_h', contain 1 outlier at right whiskers side.
  • P_tilt in class as 'normal', contain 2 outliers, 1 at right whiskers side and another at left whisker side.
In [37]:
box=px.box(patients,x='L_angle', y='Class')
box.show();
  • L_angle in class as 'type_s', contain 1 outlier at right whiskers side.
  • L_angle in class as 'type_h', contain 1 outlier at right whiskers side.
  • L_angle in class as 'normal', contain 1 outlier, 1 at right whiskers side.
  • L_angle values are larger for class 'type_s'
In [38]:
box=px.box(patients,x='S_slope', y='Class')
box.show();
  • S_slope in class as 'type_s', contain 7 outliers, 5 at right whiskers side and 2 at left whiskers side.
  • S_slope in class as 'type_h', contain no outlier.
  • S_slope in class as 'normal', contain 1 outlier, 1 at right whiskers side.
In [39]:
box=px.box(patients,x='P_radius', y='Class')
box.show();
  • P_radius in class as 'type_s', contain 4 outliers, 3 at right whiskers side and 1 at left whiskers side.
  • P_radius in class as 'type_h', contain 1 outlier at left whiskers side.
  • P_radius in class as 'normal', contain 2 outliers, 1 at right whiskers side and 1 at left whiskers side.
In [40]:
box=px.box(patients,x='S_Degree', y='Class')
box.show();
  • S_Degree in class as 'type_s', contain 7 outliers at right whiskers side.
  • S_Degree in class as 'type_h', contain no outlier.
  • S_Degree in class as 'normal', contain 3 outliers at right whiskers side.

4. Model Building¶

Converting 'Class' to numerical values.

In [41]:
label_encoder=LabelEncoder()
patients.Class=label_encoder.fit_transform(patients.Class)
patients.Class.value_counts()
Out[41]:
2    150
0    100
1     60
Name: Class, dtype: int64
In [42]:
patients.Class=patients['Class'].astype('category')
patients.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   P_incidence  310 non-null    float64 
 1   P_tilt       310 non-null    float64 
 2   L_angle      310 non-null    float64 
 3   S_slope      310 non-null    float64 
 4   P_radius     310 non-null    float64 
 5   S_Degree     310 non-null    float64 
 6   Class        310 non-null    category
dtypes: category(1), float64(6)
memory usage: 15.1 KB

A. Split data into X and Y.¶

In [43]:
# Independent Variables
X_outlier1_patient=patients.drop(columns='Class', axis=1)

# Target Variables
y_outlier1_patient=patients.Class
In [44]:
# Standardizing Independent Variables i.e. X
X_outlier1_patient_Scaled=X_outlier1_patient.apply(stats.zscore)
X_outlier1_patient_Scaled.describe()
Out[44]:
P_incidence P_tilt L_angle S_slope P_radius S_Degree
count 3.100000e+02 3.100000e+02 3.100000e+02 3.100000e+02 3.100000e+02 3.100000e+02
mean 1.042177e-16 2.096889e-16 1.980495e-16 2.829278e-17 -5.071212e-16 -1.277204e-16
std 1.001617e+00 1.001617e+00 1.001617e+00 1.001617e+00 1.001617e+00 1.001617e+00
min -1.996010e+00 -2.411664e+00 -2.047652e+00 -2.207741e+00 -3.597963e+00 -9.961725e-01
25% -8.173982e-01 -6.881138e-01 -8.060267e-01 -7.168418e-01 -5.423830e-01 -6.585073e-01
50% -1.049246e-01 -1.186061e-01 -1.278621e-01 -4.095971e-02 2.613767e-02 -3.874502e-01
75% 7.194643e-01 4.581158e-01 5.975493e-01 7.269414e-01 5.676209e-01 3.997679e-01
max 4.029206e+00 3.191402e+00 3.984615e+00 5.855771e+00 3.395818e+00 1.046035e+01

Independent variables are standardized using z-score, and now can be used for futher analysis.

B. Split data into train and test with 80:20 proportion.¶

In [45]:
X_outlier1_patient_train, X_outlier1_patient_test, y_outlier1_patient_train, y_outlier1_patient_test = train_test_split(X_outlier1_patient_Scaled, y_outlier1_patient, test_size=0.20, random_state=42)
In [46]:
X_outlier1_patient_train.shape, X_outlier1_patient_test.shape
Out[46]:
((248, 6), (62, 6))

C. Train a Supervised Learning Classification base model using KNN classifier.¶

In [47]:
NNH=KNeighborsClassifier(n_neighbors=int(np.sqrt(len(patients))), metric='euclidean')
In [48]:
NNH.fit(X_outlier1_patient_train, y_outlier1_patient_train)
y_outlier1_patient_predicted = NNH.predict(X_outlier1_patient_test)
print('Accuracy on Training data:',NNH.score(X_outlier1_patient_train, y_outlier1_patient_train))
print('Accuracy on Test data:',NNH.score(X_outlier1_patient_test, y_outlier1_patient_test))
Accuracy on Training data: 0.7782258064516129
Accuracy on Test data: 0.8387096774193549

Training Accuracy is 78% and Testing Accuracy is 84% with n_neighbors as 17.

D. Print all the possible performance metrics for both train and test data.¶

In [49]:
cm = confusion_matrix(y_outlier1_patient_test, y_outlier1_patient_predicted, labels=[0, 1, 2])

df_cm = pd.DataFrame(cm, index = [i for i in ["Normal","Type_H","Type_S"]],
                  columns = [i for i in ["Normal","Type_H","Type_S"]])
plt.figure(figsize=(10,7))
sns.heatmap(df_cm, annot=True, cmap='YlGnBu', lw=1)
plt.yticks(fontsize=12, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.show();
In [50]:
print("Classification Matrix:\n",classification_report(y_outlier1_patient_test,y_outlier1_patient_predicted))
Classification Matrix:
               precision    recall  f1-score   support

           0       0.78      0.82      0.80        22
           1       0.64      0.70      0.67        10
           2       0.96      0.90      0.93        30

    accuracy                           0.84        62
   macro avg       0.79      0.81      0.80        62
weighted avg       0.85      0.84      0.84        62

  • Accuracy 84%
  • By Precision,
    • class 0 is predicted 78% correctly.
    • class 1 is predicted 64% correctly.
    • class 2 is predicted 96% correctly.
  • F1-Score is highest for class 2 i.e. 93%

5. Performance Improvement¶

A. Experiment with various parameters to improve performance of the base model.¶

In [51]:
for k in range(1,20,2):
    NNH = KNeighborsClassifier(n_neighbors= k, metric='euclidean')
    NNH.fit(X_outlier1_patient_train, y_outlier1_patient_train)
    y_outlier2_patient_predicted = NNH.predict(X_outlier1_patient_test)
    print('K:',k)
    print("Accuracy on Training data:",NNH.score(X_outlier1_patient_train, y_outlier1_patient_train))
    print("Accuracy on Testing data:",NNH.score(X_outlier1_patient_test, y_outlier1_patient_test))
    print("Classification Matrix:\n",classification_report(y_outlier1_patient_test,y_outlier2_patient_predicted))
K: 1
Accuracy on Training data: 1.0
Accuracy on Testing data: 0.7903225806451613
Classification Matrix:
               precision    recall  f1-score   support

           0       0.71      0.77      0.74        22
           1       0.64      0.70      0.67        10
           2       0.93      0.83      0.88        30

    accuracy                           0.79        62
   macro avg       0.76      0.77      0.76        62
weighted avg       0.80      0.79      0.79        62

K: 3
Accuracy on Training data: 0.9153225806451613
Accuracy on Testing data: 0.8064516129032258
Classification Matrix:
               precision    recall  f1-score   support

           0       0.72      0.82      0.77        22
           1       0.60      0.60      0.60        10
           2       0.96      0.87      0.91        30

    accuracy                           0.81        62
   macro avg       0.76      0.76      0.76        62
weighted avg       0.82      0.81      0.81        62

K: 5
Accuracy on Training data: 0.8387096774193549
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
               precision    recall  f1-score   support

           0       0.79      0.86      0.83        22
           1       0.70      0.70      0.70        10
           2       0.93      0.87      0.90        30

    accuracy                           0.84        62
   macro avg       0.81      0.81      0.81        62
weighted avg       0.84      0.84      0.84        62

K: 7
Accuracy on Training data: 0.8387096774193549
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
               precision    recall  f1-score   support

           0       0.79      0.86      0.83        22
           1       0.67      0.60      0.63        10
           2       0.93      0.90      0.92        30

    accuracy                           0.84        62
   macro avg       0.80      0.79      0.79        62
weighted avg       0.84      0.84      0.84        62

K: 9
Accuracy on Training data: 0.8225806451612904
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
               precision    recall  f1-score   support

           0       0.76      0.86      0.81        22
           1       0.70      0.70      0.70        10
           2       0.96      0.87      0.91        30

    accuracy                           0.84        62
   macro avg       0.81      0.81      0.81        62
weighted avg       0.85      0.84      0.84        62

K: 11
Accuracy on Training data: 0.8145161290322581
Accuracy on Testing data: 0.8225806451612904
Classification Matrix:
               precision    recall  f1-score   support

           0       0.72      0.82      0.77        22
           1       0.78      0.70      0.74        10
           2       0.93      0.87      0.90        30

    accuracy                           0.82        62
   macro avg       0.81      0.79      0.80        62
weighted avg       0.83      0.82      0.82        62

K: 13
Accuracy on Training data: 0.8104838709677419
Accuracy on Testing data: 0.8548387096774194
Classification Matrix:
               precision    recall  f1-score   support

           0       0.79      0.86      0.83        22
           1       0.73      0.80      0.76        10
           2       0.96      0.87      0.91        30

    accuracy                           0.85        62
   macro avg       0.83      0.84      0.83        62
weighted avg       0.86      0.85      0.86        62

K: 15
Accuracy on Training data: 0.7862903225806451
Accuracy on Testing data: 0.8548387096774194
Classification Matrix:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82        22
           1       0.64      0.70      0.67        10
           2       0.97      0.93      0.95        30

    accuracy                           0.85        62
   macro avg       0.81      0.82      0.81        62
weighted avg       0.86      0.85      0.86        62

K: 17
Accuracy on Training data: 0.7782258064516129
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
               precision    recall  f1-score   support

           0       0.78      0.82      0.80        22
           1       0.64      0.70      0.67        10
           2       0.96      0.90      0.93        30

    accuracy                           0.84        62
   macro avg       0.79      0.81      0.80        62
weighted avg       0.85      0.84      0.84        62

K: 19
Accuracy on Training data: 0.7903225806451613
Accuracy on Testing data: 0.8548387096774194
Classification Matrix:
               precision    recall  f1-score   support

           0       0.79      0.86      0.83        22
           1       0.70      0.70      0.70        10
           2       0.96      0.90      0.93        30

    accuracy                           0.85        62
   macro avg       0.82      0.82      0.82        62
weighted avg       0.86      0.85      0.86        62

If we don't remove the outliers, k=13, makes improvement in Accuracy as 81%(training), 85%(Testing) and precision as 79%(class0), 73%(class1) and 96%(class2).

Let remove possible outliers and re-run the model to check the report.

In [52]:
patients_with_outliers=patients.copy(deep=True)
columns=patients.columns.drop('Class')
for col in columns:
   q1,q3=np.quantile(patients[col],0.25),np.quantile(patients[col],0.75)
   threshold=(q3-q1)*1.5
   lower,upper=q1-threshold,q3+threshold

   median=patients[col].median()

   patients[col]=np.where(patients[col]>upper,median,patients[col])
   patients[col]=np.where(patients[col]<lower,median,patients[col])
In [53]:
# some outliers will be removed
box=px.box(patients.drop(columns=['Class']), orientation='h')
box.show();
In [54]:
# Independent Variables
X_wo_outlier1_patient=patients.drop(columns='Class')

# Target Variables
y_wo_outlier1_patient=patients.Class

X_wo_outlier1_patient_train, X_wo_outlier1_patient_test, y_wo_outlier1_patient_train, y_wo_outlier1_patient_test = train_test_split(X_wo_outlier1_patient.apply(stats.zscore), y_wo_outlier1_patient, test_size=0.20, random_state=42)

NNH=KNeighborsClassifier(n_neighbors= int(np.sqrt(len(patients))), metric='euclidean')

NNH.fit(X_wo_outlier1_patient_train, y_wo_outlier1_patient_train)
y_wo_outlier1_patient_predicted = NNH.predict(X_wo_outlier1_patient_test)
print('Accuracy on Training data:',NNH.score(X_wo_outlier1_patient_train, y_wo_outlier1_patient_train))
print('Accuracy on Test data:',NNH.score(X_wo_outlier1_patient_test, y_wo_outlier1_patient_test))

print("Classification Matrix:\n",classification_report(y_wo_outlier1_patient_test,y_wo_outlier1_patient_predicted))
Accuracy on Training data: 0.8024193548387096
Accuracy on Test data: 0.8387096774193549
Classification Matrix:
               precision    recall  f1-score   support

           0       0.76      0.86      0.81        22
           1       0.70      0.70      0.70        10
           2       0.96      0.87      0.91        30

    accuracy                           0.84        62
   macro avg       0.81      0.81      0.81        62
weighted avg       0.85      0.84      0.84        62

After removing outliers, Training Accuracy is 80% and Testing Accuracy as 84% with n_neighbors as 17.

In [55]:
train_score=[]
test_score=[]
for k in range(1,20,2):
    NNH = KNeighborsClassifier(n_neighbors= k, metric='euclidean')
    NNH.fit(X_wo_outlier1_patient_train, y_wo_outlier1_patient_train)
    y_wo_outlier2_patient_predicted = NNH.predict(X_wo_outlier1_patient_test)
    print('K:',k)
    print("Accuracy on Training data:",NNH.score(X_wo_outlier1_patient_train, y_wo_outlier1_patient_train))
    print("Accuracy on Testing data:",NNH.score(X_wo_outlier1_patient_test, y_wo_outlier1_patient_test))
    print("Classification Matrix:\n",classification_report(y_wo_outlier1_patient_test,y_wo_outlier2_patient_predicted))
K: 1
Accuracy on Training data: 1.0
Accuracy on Testing data: 0.8709677419354839
Classification Matrix:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84        22
           1       0.70      0.70      0.70        10
           2       0.97      0.93      0.95        30

    accuracy                           0.87        62
   macro avg       0.83      0.83      0.83        62
weighted avg       0.87      0.87      0.87        62

K: 3
Accuracy on Training data: 0.8911290322580645
Accuracy on Testing data: 0.8064516129032258
Classification Matrix:
               precision    recall  f1-score   support

           0       0.76      0.73      0.74        22
           1       0.54      0.70      0.61        10
           2       0.96      0.90      0.93        30

    accuracy                           0.81        62
   macro avg       0.75      0.78      0.76        62
weighted avg       0.82      0.81      0.81        62

K: 5
Accuracy on Training data: 0.8548387096774194
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
               precision    recall  f1-score   support

           0       0.81      0.77      0.79        22
           1       0.64      0.70      0.67        10
           2       0.93      0.93      0.93        30

    accuracy                           0.84        62
   macro avg       0.79      0.80      0.80        62
weighted avg       0.84      0.84      0.84        62

K: 7
Accuracy on Training data: 0.8346774193548387
Accuracy on Testing data: 0.8225806451612904
Classification Matrix:
               precision    recall  f1-score   support

           0       0.77      0.77      0.77        22
           1       0.58      0.70      0.64        10
           2       0.96      0.90      0.93        30

    accuracy                           0.82        62
   macro avg       0.77      0.79      0.78        62
weighted avg       0.83      0.82      0.83        62

K: 9
Accuracy on Training data: 0.8387096774193549
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
               precision    recall  f1-score   support

           0       0.76      0.86      0.81        22
           1       0.67      0.60      0.63        10
           2       0.96      0.90      0.93        30

    accuracy                           0.84        62
   macro avg       0.80      0.79      0.79        62
weighted avg       0.84      0.84      0.84        62

K: 11
Accuracy on Training data: 0.8185483870967742
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
               precision    recall  f1-score   support

           0       0.78      0.82      0.80        22
           1       0.64      0.70      0.67        10
           2       0.96      0.90      0.93        30

    accuracy                           0.84        62
   macro avg       0.79      0.81      0.80        62
weighted avg       0.85      0.84      0.84        62

K: 13
Accuracy on Training data: 0.8064516129032258
Accuracy on Testing data: 0.8548387096774194
Classification Matrix:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84        22
           1       0.67      0.80      0.73        10
           2       0.96      0.87      0.91        30

    accuracy                           0.85        62
   macro avg       0.82      0.84      0.83        62
weighted avg       0.87      0.85      0.86        62

K: 15
Accuracy on Training data: 0.7983870967741935
Accuracy on Testing data: 0.8225806451612904
Classification Matrix:
               precision    recall  f1-score   support

           0       0.76      0.86      0.81        22
           1       0.60      0.60      0.60        10
           2       0.96      0.87      0.91        30

    accuracy                           0.82        62
   macro avg       0.77      0.78      0.77        62
weighted avg       0.83      0.82      0.83        62

K: 17
Accuracy on Training data: 0.8024193548387096
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
               precision    recall  f1-score   support

           0       0.76      0.86      0.81        22
           1       0.70      0.70      0.70        10
           2       0.96      0.87      0.91        30

    accuracy                           0.84        62
   macro avg       0.81      0.81      0.81        62
weighted avg       0.85      0.84      0.84        62

K: 19
Accuracy on Training data: 0.8024193548387096
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
               precision    recall  f1-score   support

           0       0.79      0.86      0.83        22
           1       0.64      0.70      0.67        10
           2       0.96      0.87      0.91        30

    accuracy                           0.84        62
   macro avg       0.80      0.81      0.80        62
weighted avg       0.85      0.84      0.84        62

After removing the outliers, k=13, makes improvement in Accuracy as 81%(training), 85%(Testing) and precision as 83%(class0), 67%(class1) and 96%(class2).

C. Clearly state which parameters contributed most to improve model performance.¶

Used different types of method to calculate the n_neighbors, with and without outliers, where outliers weren't that much so have modelled with both perspectives. n_neighbors(K)=13, has given best result, for both with and without outliers. n_neighbors(K) parameter here is significant to improve the performance with the me

#¶

PART-B¶

#¶

1. Data Understanding and Preparation¶

A. Read both the Datasets ‘Data1’ and ‘Data 2’ as DataFrame and store them into two separate variables.¶

In [56]:
# Reading Data1.csv
data1=pd.read_csv('Data1.csv')
data1.head()
Out[56]:
ID Age CustomerSince HighestSpend ZipCode HiddenScore MonthlyAverageSpend Level
0 1 25 1 49 91107 4 1.6 1
1 2 45 19 34 90089 3 1.5 1
2 3 39 15 11 94720 1 1.0 1
3 4 35 9 100 94112 1 2.7 2
4 5 35 8 45 91330 4 1.0 2
In [57]:
# Reading Data2.csv
data2=pd.read_csv('Data2.csv')
data2.head()
Out[57]:
ID Mortgage Security FixedDepositAccount InternetBanking CreditCard LoanOnCard
0 1 0 1 0 0 0 NaN
1 2 0 1 0 0 0 NaN
2 3 0 0 0 0 0 NaN
3 4 0 0 0 0 0 NaN
4 5 0 0 0 0 1 NaN

B. Print shape and Column Names and DataTypes of both the Dataframes.¶

In [58]:
print("Shape of data1 dataset", data1.shape)
data1.info()
Shape of data1 dataset (5000, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   5000 non-null   int64  
 1   Age                  5000 non-null   int64  
 2   CustomerSince        5000 non-null   int64  
 3   HighestSpend         5000 non-null   int64  
 4   ZipCode              5000 non-null   int64  
 5   HiddenScore          5000 non-null   int64  
 6   MonthlyAverageSpend  5000 non-null   float64
 7   Level                5000 non-null   int64  
dtypes: float64(1), int64(7)
memory usage: 312.6 KB

Data1 dataset contains 5000 data with 8 columns.

In [59]:
print("Shape of data2 dataset", data2.shape)
data2.info()
Shape of data2 dataset (5000, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   5000 non-null   int64  
 1   Mortgage             5000 non-null   int64  
 2   Security             5000 non-null   int64  
 3   FixedDepositAccount  5000 non-null   int64  
 4   InternetBanking      5000 non-null   int64  
 5   CreditCard           5000 non-null   int64  
 6   LoanOnCard           4980 non-null   float64
dtypes: float64(1), int64(6)
memory usage: 273.6 KB

Data2 dataset contains 5000 data with 7 columns.

C. Merge both the Dataframes on ‘ID’ feature to form a single DataFrame.¶

In [60]:
cust_data=pd.merge(data1,data2,on='ID')
cust_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   5000 non-null   int64  
 1   Age                  5000 non-null   int64  
 2   CustomerSince        5000 non-null   int64  
 3   HighestSpend         5000 non-null   int64  
 4   ZipCode              5000 non-null   int64  
 5   HiddenScore          5000 non-null   int64  
 6   MonthlyAverageSpend  5000 non-null   float64
 7   Level                5000 non-null   int64  
 8   Mortgage             5000 non-null   int64  
 9   Security             5000 non-null   int64  
 10  FixedDepositAccount  5000 non-null   int64  
 11  InternetBanking      5000 non-null   int64  
 12  CreditCard           5000 non-null   int64  
 13  LoanOnCard           4980 non-null   float64
dtypes: float64(2), int64(12)
memory usage: 585.9 KB

Cust_data dataset contains 5000 data with 14 columns

D. Change Datatype of below features to ‘Object’.¶

In [61]:
col=['CreditCard','InternetBanking','FixedDepositAccount','Security','Level','HiddenScore']
for i in col:
    cust_data[i]=cust_data[i].astype('object')
cust_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   5000 non-null   int64  
 1   Age                  5000 non-null   int64  
 2   CustomerSince        5000 non-null   int64  
 3   HighestSpend         5000 non-null   int64  
 4   ZipCode              5000 non-null   int64  
 5   HiddenScore          5000 non-null   object 
 6   MonthlyAverageSpend  5000 non-null   float64
 7   Level                5000 non-null   object 
 8   Mortgage             5000 non-null   int64  
 9   Security             5000 non-null   object 
 10  FixedDepositAccount  5000 non-null   object 
 11  InternetBanking      5000 non-null   object 
 12  CreditCard           5000 non-null   object 
 13  LoanOnCard           4980 non-null   float64
dtypes: float64(2), int64(6), object(6)
memory usage: 585.9+ KB

As these features contains binary values, i.e. 1&0, in future analysis we can convert same to category type also.

2. Data Exploration and Analysis¶

A. Visualize distribution of Target variable ‘LoanOnCard’ and clearly share insights.¶

B. Check the percentage of missing values and impute if required.¶

C. Check for unexpected values in each categorical variable and impute with best suitable value.¶

In [62]:
plt.figure(figsize=(10,7))
sns.countplot(data=cust_data,x='LoanOnCard',order=[0,1])
plt.xlabel(xlabel='LoanOnCard', fontsize=12, weight='bold')
plt.ylabel(ylabel='count', fontsize=12, weight='bold')
plt.title(label='LoanOnCard Target Variable Representation', fontsize=18, weight='bold')
plt.show();
In [63]:
cust_data.LoanOnCard.value_counts()
Out[63]:
0.0    4500
1.0     480
Name: LoanOnCard, dtype: int64
In [64]:
cust_data.LoanOnCard.isnull().sum()
Out[64]:
20

As we can see, our dataset contains missing values as total entries are 5000, and here classification is done for 4980(4500(0 class) & 480(1 class)) entries and 20 as missing values. So need to do some data cleansing. As LoanOnCard is target value, we can convert it to category dtype.

In [65]:
# To check the missing data percentage
cust_data.isnull().mean() * 100
Out[65]:
ID                     0.0
Age                    0.0
CustomerSince          0.0
HighestSpend           0.0
ZipCode                0.0
HiddenScore            0.0
MonthlyAverageSpend    0.0
Level                  0.0
Mortgage               0.0
Security               0.0
FixedDepositAccount    0.0
InternetBanking        0.0
CreditCard             0.0
LoanOnCard             0.4
dtype: float64
In [66]:
(cust_data.isnull().sum() / cust_data.shape[0]) * 100
Out[66]:
ID                     0.0
Age                    0.0
CustomerSince          0.0
HighestSpend           0.0
ZipCode                0.0
HiddenScore            0.0
MonthlyAverageSpend    0.0
Level                  0.0
Mortgage               0.0
Security               0.0
FixedDepositAccount    0.0
InternetBanking        0.0
CreditCard             0.0
LoanOnCard             0.4
dtype: float64

0.4% of data is missing for 'LoanOnCard' feature, which is about 20 entries. Let's remove these missing entries.

In [67]:
cust_data.dropna(axis=0,inplace=True)
cust_data.isnull().sum()
Out[67]:
ID                     0
Age                    0
CustomerSince          0
HighestSpend           0
ZipCode                0
HiddenScore            0
MonthlyAverageSpend    0
Level                  0
Mortgage               0
Security               0
FixedDepositAccount    0
InternetBanking        0
CreditCard             0
LoanOnCard             0
dtype: int64
In [68]:
cust_data.LoanOnCard=cust_data.LoanOnCard.astype('int')
cust_data.LoanOnCard=cust_data.LoanOnCard.astype('category')
cust_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4980 entries, 9 to 4999
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   ID                   4980 non-null   int64   
 1   Age                  4980 non-null   int64   
 2   CustomerSince        4980 non-null   int64   
 3   HighestSpend         4980 non-null   int64   
 4   ZipCode              4980 non-null   int64   
 5   HiddenScore          4980 non-null   object  
 6   MonthlyAverageSpend  4980 non-null   float64 
 7   Level                4980 non-null   object  
 8   Mortgage             4980 non-null   int64   
 9   Security             4980 non-null   object  
 10  FixedDepositAccount  4980 non-null   object  
 11  InternetBanking      4980 non-null   object  
 12  CreditCard           4980 non-null   object  
 13  LoanOnCard           4980 non-null   category
dtypes: category(1), float64(1), int64(6), object(6)
memory usage: 549.7+ KB

As missing data has been removed and LoanOnCard dtype updated to category type, let's re-visualize distribution of Target variable ‘LoanOnCard’.

In [69]:
plt.figure(figsize=(10,7))
sns.countplot(data=cust_data,x='LoanOnCard')
plt.xlabel(xlabel='LoanOnCard', fontsize=12, weight='bold')
plt.ylabel(ylabel='count', fontsize=12, weight='bold')
plt.title(label='LoanOnCard Target Variable Representation', fontsize=18, weight='bold')
plt.show();
In [70]:
fig = px.pie(cust_data, hole=0.3, values=cust_data.LoanOnCard.value_counts(), names=['Loan on Card' if i else 'No Loan on Card' for i in cust_data.LoanOnCard.value_counts().index], color_discrete_sequence=px.colors.sequential.turbid_r, title='LoanOnCard Representation')
fig.show();

90.4%(4500) people don't have loan on card and 9.64%(480) people have existing loan on card.

In [71]:
col=['CreditCard','InternetBanking','FixedDepositAccount','Security','Level','HiddenScore']
for i in col:
    cust_data[i]=cust_data[i].astype('category')
cust_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4980 entries, 9 to 4999
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   ID                   4980 non-null   int64   
 1   Age                  4980 non-null   int64   
 2   CustomerSince        4980 non-null   int64   
 3   HighestSpend         4980 non-null   int64   
 4   ZipCode              4980 non-null   int64   
 5   HiddenScore          4980 non-null   category
 6   MonthlyAverageSpend  4980 non-null   float64 
 7   Level                4980 non-null   category
 8   Mortgage             4980 non-null   int64   
 9   Security             4980 non-null   category
 10  FixedDepositAccount  4980 non-null   category
 11  InternetBanking      4980 non-null   category
 12  CreditCard           4980 non-null   category
 13  LoanOnCard           4980 non-null   category
dtypes: category(7), float64(1), int64(6)
memory usage: 346.2 KB

Analyzing categorical variables.

In [72]:
cust_data.HiddenScore.value_counts(),cust_data.HiddenScore.value_counts().sum()
Out[72]:
(1    1466
 2    1293
 4    1215
 3    1006
 Name: HiddenScore, dtype: int64,
 4980)
In [73]:
fig = px.pie(cust_data, hole=0.3, values=cust_data.HiddenScore.value_counts(), names=cust_data.HiddenScore.value_counts().index, color_discrete_sequence=px.colors.sequential.turbid_r, title='HiddenScore Representation')
fig.show();

Distribution of data in HiddenScore has no unexpected values, so no need of imputation.

In [74]:
cust_data.Level.value_counts(),cust_data.Level.value_counts().sum()
Out[74]:
(1    2089
 3    1496
 2    1395
 Name: Level, dtype: int64,
 4980)
In [75]:
fig = px.pie(cust_data, hole=0.3, values=cust_data.Level.value_counts(), names=cust_data.Level.value_counts().index, color_discrete_sequence=px.colors.sequential.turbid_r, title='Level Representation')
fig.show();

Distribution of data in Level has no unexpected values, so no need of imputation.

In [76]:
cust_data.Security.value_counts(),cust_data.Security.value_counts().sum()
Out[76]:
(0    4460
 1     520
 Name: Security, dtype: int64,
 4980)
In [77]:
fig = px.pie(cust_data, hole=0.3, values=cust_data.Security.value_counts(), names=cust_data.Security.value_counts().index, color_discrete_sequence=px.colors.sequential.turbid_r, title='Security Representation')
fig.show();

Distribution of data in Security has no unexpected values, so no need of imputation.

In [78]:
cust_data.FixedDepositAccount.value_counts(),cust_data.FixedDepositAccount.value_counts().sum()
Out[78]:
(0    4678
 1     302
 Name: FixedDepositAccount, dtype: int64,
 4980)
In [79]:
fig = px.pie(cust_data, hole=0.3, values=cust_data.FixedDepositAccount.value_counts(), names=cust_data.FixedDepositAccount.value_counts().index, color_discrete_sequence=px.colors.sequential.turbid_r, title='FixedDepositAccount Representation')
fig.show();

Distribution of data in FixedDepositAccount has no unexpected values, so no need of imputation.

In [80]:
cust_data.InternetBanking.value_counts(),cust_data.InternetBanking.value_counts().sum()
Out[80]:
(1    2974
 0    2006
 Name: InternetBanking, dtype: int64,
 4980)
In [81]:
fig = px.pie(cust_data, hole=0.3, values=cust_data.InternetBanking.value_counts(), names=cust_data.InternetBanking.value_counts().index, color_discrete_sequence=px.colors.sequential.turbid_r, title='InternetBanking Representation')
fig.show();

Distribution of data in InternetBanking has no unexpected values, so no need of imputation.

In [82]:
cust_data.CreditCard.value_counts(),cust_data.CreditCard.value_counts().sum()
Out[82]:
(0    3514
 1    1466
 Name: CreditCard, dtype: int64,
 4980)
In [83]:
fig = px.pie(cust_data, hole=0.3, values=cust_data.CreditCard.value_counts(), names=cust_data.CreditCard.value_counts().index, color_discrete_sequence=px.colors.sequential.turbid_r, title='CreditCard Representation')
fig.show();

Distribution of data in CreditCard has no unexpected values, so no need of imputation.

3. Data Preparation and model building¶

In [84]:
# Calculating correlation of independent variables with the target variable.

cust_data_dup=cust_data.copy(deep=True)
cust_data_dup.HiddenScore=cust_data_dup.HiddenScore.astype('int64')
cust_data_dup.Level=cust_data_dup.Level.astype('int64')
cust_data_dup.Level=cust_data_dup.Level.astype('int64')
cust_data_dup.Security=cust_data_dup.Security.astype('int64')
cust_data_dup.FixedDepositAccount=cust_data_dup.FixedDepositAccount.astype('int64')
cust_data_dup.CreditCard=cust_data_dup.CreditCard.astype('int64')
cust_data_dup.InternetBanking=cust_data_dup.InternetBanking.astype('int64')
cust_data_dup.LoanOnCard=cust_data_dup.LoanOnCard.astype('int64')
cust_data_dup.drop(columns=['ID','ZipCode'], inplace=True, axis=1)
cust_data_dup.corr()
Out[84]:
Age CustomerSince HighestSpend HiddenScore MonthlyAverageSpend Level Mortgage Security FixedDepositAccount InternetBanking CreditCard LoanOnCard
Age 1.000000 0.994208 -0.054951 -0.045289 -0.051896 0.042750 -0.013272 0.000323 0.007744 0.011227 0.007344 -0.008147
CustomerSince 0.994208 1.000000 -0.046092 -0.051456 -0.049918 0.014545 -0.011380 -0.000469 0.010085 0.011355 0.008779 -0.007801
HighestSpend -0.054951 -0.046092 1.000000 -0.158357 0.646109 -0.188909 0.207236 -0.002284 0.169535 0.014202 -0.002780 0.502626
HiddenScore -0.045289 -0.051456 -0.158357 1.000000 -0.109180 0.065762 -0.021396 0.019061 0.014327 0.010900 0.010784 0.061761
MonthlyAverageSpend -0.051896 -0.049918 0.646109 -0.109180 1.000000 -0.137020 0.110275 0.015105 0.136410 -0.003475 -0.006577 0.366912
Level 0.042750 0.014545 -0.188909 0.065762 -0.137020 1.000000 -0.032863 -0.009443 0.013982 -0.014556 -0.011766 0.137010
Mortgage -0.013272 -0.011380 0.207236 -0.021396 0.110275 -0.032863 1.000000 -0.005002 0.089167 -0.007044 -0.007600 0.141947
Security 0.000323 -0.000469 -0.002284 0.019061 0.015105 -0.009443 -0.005002 1.000000 0.317673 0.014007 -0.014518 0.021982
FixedDepositAccount 0.007744 0.010085 0.169535 0.014327 0.136410 0.013982 0.089167 0.317673 1.000000 0.176082 0.278924 0.316131
InternetBanking 0.011227 0.011355 0.014202 0.010900 -0.003475 -0.014556 -0.007044 0.014007 0.176082 1.000000 0.004960 0.006034
CreditCard 0.007344 0.008779 -0.002780 0.010784 -0.006577 -0.011766 -0.007600 -0.014518 0.278924 0.004960 1.000000 0.002536
LoanOnCard -0.008147 -0.007801 0.502626 0.061761 0.366912 0.137010 0.141947 0.021982 0.316131 0.006034 0.002536 1.000000

Analyzing above correlation with LoanOnCard with other features, its interpreted that we can remove Age, CustomerSince, Security, InternetBanking, CreditCard, as these feature don't have any relation with LoanOnCard.

A. Split data into X and Y.¶

In [85]:
# Independent Variables
X=cust_data.drop(columns=['ID','ZipCode', 'LoanOnCard', 'Age', 'CustomerSince', 'Security', 'InternetBanking', 'CreditCard'], axis=1)

# Target Variables
y=cust_data.LoanOnCard

B. Split data into train and test. Keep 25% data reserved for testing.¶

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
In [87]:
X_train.shape, X_test.shape
Out[87]:
((3735, 6), (1245, 6))

C. Train a Supervised Learning Classification base model - Logistic Regression.¶

D. Print evaluation metrics for the model and clearly share insights.¶

In [88]:
model=LogisticRegression()
model.fit(X_train, y_train)
y_predict=model.predict(X_test)
model.coef_, model.intercept_
Out[88]:
(array([[5.23258503e-02, 6.28135677e-01, 1.24264105e-01, 1.68758188e+00,
         1.18143098e-03, 2.45440902e+00]]),
 array([-13.47246279]))
In [89]:
print("Accuracy score Training dataset:{:.2f}".format(model.score(X_train, y_train)))
print("Accuracy score Testing dataset:{:.2f}".format(model.score(X_test, y_test)))
Accuracy score Training dataset:0.95
Accuracy score Testing dataset:0.94
In [90]:
print("Log loss:{:.2f}".format(log_loss(y_test, y_predict)))
sns.heatmap(confusion_matrix(y_test, y_predict), annot=True, fmt='.2f')
print(classification_report(y_test,y_predict))
Log loss:2.00
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1125
           1       0.77      0.57      0.65       120

    accuracy                           0.94      1245
   macro avg       0.86      0.77      0.81      1245
weighted avg       0.94      0.94      0.94      1245

  • Accuracy 94%
  • By Precision,
    • class 0 is predicted 96% correctly.
    • class 1 is predicted 77% correctly.
  • F1-Score is highest for class 0 i.e. 97%

E. Balance the data using the right balancing technique.¶

In [91]:
cust_data.LoanOnCard.value_counts()
Out[91]:
0    4500
1     480
Name: LoanOnCard, dtype: int64

Imbalance distribution. As having LoanOnCard is 480, let's take balanced distribution as 480 for no LoanOnCard.

In [92]:
cust_data_balanced=cust_data[cust_data.LoanOnCard==0].sample(n=480, random_state=42).append(cust_data[cust_data.LoanOnCard==1])
cust_data_balanced.reset_index(drop=True, inplace=True)
cust_data_balanced.LoanOnCard.value_counts()
Out[92]:
0    480
1    480
Name: LoanOnCard, dtype: int64

Data uniformly distributed w.r.t target variable.

F. Again train the same previous model on balanced data.¶

G. Print evaluation metrics and clearly share differences observed.¶

In [93]:
# Independent Variables
X_balanced = cust_data_balanced.drop(columns=['ID','ZipCode', 'LoanOnCard', 'Age', 'CustomerSince', 'Security', 'InternetBanking', 'CreditCard'], axis=1)

# Target Variables
y_balanced = cust_data_balanced.LoanOnCard

X_balanced_train, X_balanced_test, y_balanced_train, y_balanced_test = train_test_split(X_balanced, y_balanced, test_size=0.25, random_state=42, stratify=y_balanced)
X_balanced_train.shape, X_balanced_test.shape
Out[93]:
((720, 6), (240, 6))
In [94]:
model=LogisticRegression()
model.fit(X_balanced_train, y_balanced_train)
y_balanced_predict=model.predict(X_balanced_test)

print("Accuracy score Training dataset:{:.2f}".format(model.score(X_balanced_train, y_balanced_train)))
print("Accuracy score Testing dataset:{:.2f}".format(model.score(X_balanced_test, y_balanced_test)))

print("Log loss:{:.2f}".format(log_loss(y_balanced_test, y_balanced_predict)))
sns.heatmap(confusion_matrix(y_balanced_test, y_balanced_predict), annot=True, fmt='.2f')
print(classification_report(y_balanced_test,y_balanced_predict))
Accuracy score Training dataset:0.90
Accuracy score Testing dataset:0.92
Log loss:2.88
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       120
           1       0.95      0.88      0.91       120

    accuracy                           0.92       240
   macro avg       0.92      0.92      0.92       240
weighted avg       0.92      0.92      0.92       240

Model has been trained for 2 case:

  • When data is not equally balanced, accuracy was high i.e. 94%, but can't consider for our further predictions.
  • When data is equally balanced, accuracy went minutely low from previous i.e. 92%, but can be considered for further prediction as LoanOnCard is equally divided.

Here we would be checking precision as we want more truePositive(TP), so:

  • Unbalanced data precision:
    • class 0 is predicted 96% correctly for all no LoanOnCard customer.
    • class 1 is predicted 77% correctly for all LoanOnCard customer.
  • Balanced data precision:
    • class 0 is predicted 89% correctly for all no LoanOnCard customer.
    • class 1 is predicted 95% correctly for all LoanOnCard customer.

4. Performance Improvement:¶

A. Train a base model each for SVM, KNN.¶

In [95]:
X_balanced_Scaled=pd.concat([X_balanced.select_dtypes(include='number').apply(stats.zscore) ,X_balanced[['HiddenScore','Level','FixedDepositAccount']]], axis=1)

X_balanced_scaled_train, X_balanced_scaled_test, y_balanced_scaled_train, y_balanced_scaled_test = train_test_split(X_balanced_Scaled, y_balanced, test_size=0.25, random_state=42, stratify=y_balanced)
In [96]:
# SVM Modelling
model_svm=SVC(gamma=0.1, C=1)
model_svm.fit(X_balanced_scaled_train, y_balanced_scaled_train)
y_svm_balanced_predict=model_svm.predict(X_balanced_scaled_test)

print("Accuracy score Training dataset:{:.2f}".format(model_svm.score(X_balanced_scaled_train, y_balanced_scaled_train)))
print("Accuracy score Testing dataset:{:.2f}".format(model_svm.score(X_balanced_scaled_test, y_balanced_scaled_test)))

sns.heatmap(confusion_matrix(y_balanced_scaled_test, y_svm_balanced_predict), annot=True, fmt='.2f')
print(classification_report(y_balanced_scaled_test,y_svm_balanced_predict))
Accuracy score Training dataset:0.95
Accuracy score Testing dataset:0.94
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       120
           1       0.93      0.95      0.94       120

    accuracy                           0.94       240
   macro avg       0.94      0.94      0.94       240
weighted avg       0.94      0.94      0.94       240

In [97]:
# KNN Modelling
model_knn=KNeighborsClassifier(n_neighbors=int(np.sqrt(len(X_balanced_scaled_train))), metric='euclidean')

model_knn.fit(X_balanced_scaled_train, y_balanced_scaled_train)
y_knn_balanced_predict = model_knn.predict(X_balanced_scaled_test)

print("Value of n_neighbors(K):", int(np.sqrt(len(X_balanced_scaled_train))))
print("Accuracy score Training dataset:{:.2f}".format(model_knn.score(X_balanced_scaled_train, y_balanced_scaled_train)))
print("Accuracy score Training dataset:{:.2f}".format(model_knn.score(X_balanced_scaled_test, y_balanced_scaled_test)))

sns.heatmap(confusion_matrix(y_balanced_scaled_test, y_knn_balanced_predict), annot=True, fmt='.2f')
print("Classification Matrix:\n",classification_report(y_balanced_scaled_test,y_knn_balanced_predict))
Value of n_neighbors(K): 26
Accuracy score Training dataset:0.94
Accuracy score Training dataset:0.93
Classification Matrix:
               precision    recall  f1-score   support

           0       0.93      0.94      0.93       120
           1       0.94      0.93      0.93       120

    accuracy                           0.93       240
   macro avg       0.93      0.93      0.93       240
weighted avg       0.93      0.93      0.93       240

B. Tune parameters for each of the models wherever required and finalize a model.¶

Tune parameters for SVM

In [98]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(SVC(), param_grid, refit = True)

# fitting the model for grid search
grid.fit(X_balanced_scaled_train, y_balanced_scaled_train)


# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=100, gamma=0.1)

Different values for C and gamma has been used here for SVM modelling, and best accuracy was provided by c=100, gamma=0.1 and kernel='rbf' which is default kernel.

In [99]:
# SVM Modelling
model_svm=SVC(C=100, gamma=0.1)
model_svm.fit(X_balanced_scaled_train, y_balanced_scaled_train)
y_svm_balanced_predict=model_svm.predict(X_balanced_scaled_test)

print("Accuracy score Training dataset:{:.2f}".format(model_svm.score(X_balanced_scaled_train, y_balanced_scaled_train)))
print("Accuracy score Testing dataset:{:.2f}".format(model_svm.score(X_balanced_scaled_test, y_balanced_scaled_test)))

sns.heatmap(confusion_matrix(y_balanced_scaled_test, y_svm_balanced_predict), annot=True, fmt='.2f')
print(classification_report(y_balanced_scaled_test,y_svm_balanced_predict))
Accuracy score Training dataset:0.98
Accuracy score Testing dataset:0.95
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       120
           1       0.95      0.95      0.95       120

    accuracy                           0.95       240
   macro avg       0.95      0.95      0.95       240
weighted avg       0.95      0.95      0.95       240

Tune parameters for KNN

In [100]:
# defining parameter range
param_grid = {'n_neighbors': np.arange(1,int(np.sqrt(len(X_balanced_scaled_train)))).tolist(),
              'p': [1,2]}

grid = GridSearchCV(KNeighborsClassifier(), param_grid, refit = True)

# fitting the model for grid search
grid.fit(X_balanced_scaled_train, y_balanced_scaled_train)

# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
{'n_neighbors': 3, 'p': 1}
KNeighborsClassifier(n_neighbors=3, p=1)

Different K values has been compared from 1 to 26, and k=3 has resulted as best for KNN modeling with metrics as Manhattan Distance.

In [101]:
# KNN Modelling
model_knn1=KNeighborsClassifier(n_neighbors=3, p=1)

model_knn1.fit(X_balanced_scaled_train, y_balanced_scaled_train)
y_knn_balanced_predict1 = model_knn1.predict(X_balanced_scaled_test)

print("Accuracy score Training dataset:{:.2f}".format(model_knn1.score(X_balanced_scaled_train, y_balanced_scaled_train)))
print("Accuracy score Training dataset:{:.2f}".format(model_knn1.score(X_balanced_scaled_test, y_balanced_scaled_test)))

sns.heatmap(confusion_matrix(y_balanced_scaled_test, y_knn_balanced_predict1), annot=True, fmt='.2f')
print("Classification Matrix:\n",classification_report(y_balanced_scaled_test,y_knn_balanced_predict))
Accuracy score Training dataset:0.97
Accuracy score Training dataset:0.95
Classification Matrix:
               precision    recall  f1-score   support

           0       0.93      0.94      0.93       120
           1       0.94      0.93      0.93       120

    accuracy                           0.93       240
   macro avg       0.93      0.93      0.93       240
weighted avg       0.93      0.93      0.93       240

C. Print evaluation metrics for final model.¶

D. Share improvement achieved from base model to final model.¶

As per different models trained, KNN is the best model which have given best results for testing data.

In [102]:
print("Accuracy score Training dataset:{:.2f}".format(model_svm.score(X_balanced_scaled_train, y_balanced_scaled_train)))
print("Accuracy score Testing dataset:{:.2f}".format(model_svm.score(X_balanced_scaled_test, y_balanced_scaled_test)))

sns.heatmap(confusion_matrix(y_balanced_scaled_test, y_svm_balanced_predict), annot=True, fmt='.2f')
print(classification_report(y_balanced_scaled_test,y_svm_balanced_predict))
Accuracy score Training dataset:0.98
Accuracy score Testing dataset:0.95
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       120
           1       0.95      0.95      0.95       120

    accuracy                           0.95       240
   macro avg       0.95      0.95      0.95       240
weighted avg       0.95      0.95      0.95       240

  • Accuracy for testing data is 95%.
  • Precision for customer who haven't taken any loan is 95%
  • Precision for customer who have taken loan is 95%

Base model which was trained for data was LogisticRegression(balanced & unbalanced data), and have provided accuracy as 92% on testing data, whereas SVM has provided 95% on testing data.

As per me, KNN and SVM has provided best values for predicting the potential customers. Precision is highly recommend as data need to be checked highly for falseNegative(FN).